--- title: Error analysis keywords: fastai sidebar: home_sidebar nb_path: "projects/optiver/opt-error-analysis.ipynb" ---
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.cluster import KMeans
import opt_utils as opt
from opt_utils import *
# Paths to the raw competition data and the trained DART model artifacts.
DATA_RAW = 'input/'
MODEL = 'dart_model'
# Pickled engineered training features (encoding pipeline version p13).
TRAIN = 'dart_model/enc_p13_train.pkl'
# Apply seaborn's default styling to all figures below.
sns.set()
def rdf(train):
    """Return the overall RMSPE of `train['pred']` against `train['target']`.

    As a side effect, stores each row's squared percentage error in a new
    'spe' column on `train` (used by the error analysis below).
    """
    pct_err = (train['target'] - train['pred']) / train['target']
    train['spe'] = pct_err ** 2
    return np.sqrt(train['spe'].sum() / len(train))
# Load the engineered training features and the out-of-fold predictions
# produced during cross-validation, then attach error columns for analysis.
train = pd.read_pickle(TRAIN)
oof_preds = np.load(os.path.join(MODEL, 'oof_predictions.npy'))
train['pred'] = oof_preds
# Per-row squared percentage error -- the per-row term of the RMSPE metric.
train['spe'] = ((train['target'] - train['pred']) / train['target']) ** 2
# Signed raw error (positive = overprediction).
train['raw_error'] = train['pred'] - train['target']
# Short alias for the main volatility feature.
train['rv'] = train['log_return_realized_volatility']
# Overall out-of-fold RMSPE (rdf also re-adds the 'spe' column).
print(rdf(train))
train.head(3)
Adding the ratio of the volume of shares traded to the average total bid_size and ask_size because I think it could be important
# Volume traded relative to average total book depth -- a liquidity proxy.
train['ratio'] = train.size_sum / train.sum_bid_ask_mean
train[['ratio']].corrwith(train['target'])
# Distribution of the signed error for the 100 worst-scoring rows.
top_err = train.sort_values('spe', ascending=False)
top_err[:100].raw_error.hist()
plt.title('top error histogram')
plt.show()
top_err[:100]['target'].mean()
train.columns.tolist()
train[['ratio', 'target']].corr()
top_err[:100]
top_err[:100][['ratio', 'target']].corr()
# Top 20 features by absolute correlation with the target.
train.corrwith(train['target']).abs().sort_values(ascending=False)[:20]
# Smoothed view of the error distribution.
sns.kdeplot(train['spe'])
plt.title('squared percentage error kernel density plot')
plt.show()
What the heck. This looks like I have one or a few large errors and most near zero.
# Histogram confirms the heavy right tail seen in the KDE plot.
train['spe'].hist()
plt.title('squared percentage error histogram')
plt.show()
Let's see the highest and lowest scoring rows. They will show that there is a small number of rows that are greatly increasing the error.
# Per-row errors, worst first.
train['spe'].sort_values(ascending=False)
Lets look at the cumulative sum of errors to see how spread out the errors are.
# Normalize both axes to [0, 1]: x = fraction of rows (sorted by error),
# y = fraction of the total error accumulated so far.
df = train['spe'].sort_values().cumsum().reset_index(drop=True)
df.index /= df.index.values[-1]
df /= df.max()
df.plot()
plt.suptitle('Cumulative sum of sorted errors')
plt.xlabel('fraction of rows')
plt.ylabel('fraction of error')
plt.show()
It looks like the last 20% of the rows carry about 75% of the error.
print('Errors by stock_id sorted in descending order')
# Total squared percentage error accumulated by each stock.
stock = train.groupby('stock_id')['spe'].sum()
stock.sort_values(ascending=False)
I was expecting the minimum to be much, much lower compared to the maximum spe stock_id. The worst offender, stock_id 31, is only about 3 times worse than the second at 380, and the totals gradually go down to around 100 for the lowest stock. So stock 31 needs a little special attention, but we need to look at all stocks' errors. I am guessing that the time_id aggregations won't be as balanced.
I am curious to see how the correlation between the target and log_return_realized_volatility (the most standard predictor) compares between the best and worst scoring stock.
# Correlation of the standard volatility predictor with the target for the
# worst-scoring stock (31) vs. the best-scoring stock (56).
train[train.stock_id == 31][['log_return_realized_volatility', 'target']].corr()
train[train.stock_id == 56][['log_return_realized_volatility', 'target']].corr()
It turns out that the worst performing stock has a lower correlation.
print('Errors by time_id sorted in descending order')
# Total squared percentage error accumulated in each 10-minute window.
time = train.groupby('time_id')['spe'].sum()
worst_times = time.sort_values(ascending=False)
worst_times
I was correct about time_id group score being more spread than stock_id. The worst time id is about 250 times as bad as the best scoring time_id.
I am curious to see how the correlation between the target and log_return_realized_volatility (the most standard predictor) compares between the best and worst scoring time_id.
# Same correlation check for the worst (25504) and best (4432) time_ids.
train[train.time_id == 25504][['log_return_realized_volatility', 'target']].corr()
train[train.time_id == 4432][['log_return_realized_volatility', 'target']].corr()
Similar to stock_id, the best time_id group has a higher correlation.
# Keep the five hardest-hit time_ids for the book-level plots later on.
top_5_worst_times = worst_times[:5].index
There is a great imbalance here, with the top time_id accruing about 200 times the error of the bottom. If we can identify the reason for these errors in the hardest hit time ids, we have the best chance to improve our models.
I suspect that either the first 10-minute window is low volatility, followed by a second 10-minute window with high volatility, or vice versa. For the top time_id, 25504, let's see how the realized volatility compares to the average of the first 10 minutes, and then how the target compares to the average target.
Let's normalize the target and realized volatility of the first 10 minutes. This way we put the variables on the same scale that is used when calculating correlation between variables.
# Z-score both the target and the first-window realized volatility so they
# can be compared on the same scale in the plots below.
tar_mean = train['target'].mean()
rv_mean = train.log_return_realized_volatility.mean()
tar_std = train['target'].std()
rv_std = train.log_return_realized_volatility.std()
train['norm_real_vol'] = (train.log_return_realized_volatility - rv_mean) / rv_std
train['norm_target'] = (train.target - tar_mean) / tar_std
Lets look at the top 3 worst scoring time_ids, and then the best scoring time_ids.
# Normalized target vs. first-window volatility for the three worst time_ids.
fig, axes = plt.subplots(3, 1, figsize=(15, 20))
train[train.time_id == 25504][['norm_real_vol', 'norm_target']].plot(ax=axes[0])
axes[0].set_title('Worst scoring time_id, 25504 normalized target and realized variance')
train[train.time_id == 27174][['norm_real_vol', 'norm_target']].plot(ax=axes[1])
axes[1].set_title('2nd worst scoring time_id, 27174 normalized target and realized variance')
train[train.time_id == 24034][['norm_real_vol', 'norm_target']].plot(ax=axes[2])
axes[2].set_title('3rd worst scoring time_id, 24034 normalized target and realized variance')
plt.show()
# Same plots for three of the best-scoring time_ids, for contrast.
fig, axes = plt.subplots(3, 1, figsize=(15, 20))
train[train.time_id == 29850][['norm_real_vol', 'norm_target']].plot(ax=axes[0])
axes[0].set_title('Best scoring time_id, 29850 normalized target and realized variance')
train[train.time_id == 4432][['norm_real_vol', 'norm_target']].plot(ax=axes[1])
axes[1].set_title('2nd best scoring time_id, 4432 normalized target and realized variance')
train[train.time_id == 21116][['norm_real_vol', 'norm_target']].plot(ax=axes[2])
axes[2].set_title('3rd best scoring time_id, 21116 normalized target and realized variance')
plt.show()
I am a bit confused. The 3rd worst scoring time_id looks really bad, but the first 2 look almost like best scoring time_ids. Maybe there is something I can't see yet that is hurting my model. Let's plot the same plots again, along with our predictions. We will have to scale the predictions with the same mean and std of the target to put them on the same scale.
# Scale predictions with the target's mean/std so they share the target's axis.
train['norm_pred'] = (train.pred - tar_mean) / tar_std
# Re-plot the worst time_ids with the normalized predictions overlaid.
fig, axes = plt.subplots(3, 1, figsize=(15, 20))
train[train.time_id == 25504][['norm_real_vol', 'norm_target', 'norm_pred']].plot(ax=axes[0])
axes[0].set_title('Worst scoring time_id, 25504 normalized target and realized variance')
train[train.time_id == 27174][['norm_real_vol', 'norm_target', 'norm_pred']].plot(ax=axes[1])
axes[1].set_title('2nd worst scoring time_id, 27174 normalized target and realized variance')
train[train.time_id == 24034][['norm_real_vol', 'norm_target', 'norm_pred']].plot(ax=axes[2])
axes[2].set_title('3rd worst scoring time_id, 24034 normalized target and realized variance')
plt.show()
# And the best-scoring time_ids with predictions overlaid.
fig, axes = plt.subplots(3, 1, figsize=(15, 20))
train[train.time_id == 29850][['norm_real_vol', 'norm_target', 'norm_pred']].plot(ax=axes[0])
axes[0].set_title('Best scoring time_id, 29850 normalized target and realized variance')
train[train.time_id == 4432][['norm_real_vol', 'norm_target', 'norm_pred']].plot(ax=axes[1])
axes[1].set_title('2nd best scoring time_id, 4432 normalized target and realized variance')
train[train.time_id == 21116][['norm_real_vol', 'norm_target', 'norm_pred']].plot(ax=axes[2])
axes[2].set_title('3rd best scoring time_id, 21116 normalized target and realized variance')
plt.show()
I am still confused about the third worst time_id. It looks like the predictions follow the first 10 minute volatility, and should be far off. Maybe I am missing something by normalizing. Lets just look at the raw values of preds and the target, along with the error plotted underneath for reference.
# Raw (un-normalized) target vs. prediction for the worst three time_ids,
# with each window's per-row spe plotted underneath for reference.
# NOTE(review): the subplot titles say 'normalized' but raw values are plotted.
fig, axes = plt.subplots(6, 1, figsize=(15, 35))
train[train.time_id == 25504][['target', 'pred']].plot(ax=axes[0])
axes[0].set_title('Worst scoring time_id, 25504 normalized target and realized variance')
train[train.time_id == 25504][['spe']].plot(ax=axes[1])
train[train.time_id == 27174][['target', 'pred']].plot(ax=axes[2])
axes[2].set_title('2nd worst scoring time_id, 27174 normalized target and realized variance')
train[train.time_id == 27174][['spe']].plot(ax=axes[3])
train[train.time_id == 24034][['target', 'pred']].plot(ax=axes[4])
axes[4].set_title('3rd worst scoring time_id, 24034 normalized target and realized variance')
train[train.time_id == 24034][['spe']].plot(ax=axes[5])
plt.show()
key takeaway: the largest errors are caused when the target is small because the errors are divided by the target in the metric calculation. The top two worst scoring time_ids both have a single prediction with an extremely low target, while the 3rd worst time_id has a generally low target for all stocks.
Lets start with the top 50 errors
# Plot the 50 rows with the largest squared percentage error, alongside
# their per-row spe, to see what the worst predictions have in common.
fig, axes = plt.subplots(2, 1, figsize=(15, 10))
top = train.sort_values('spe', ascending=False).head(50).reset_index()
top[['target', 'pred', 'log_return_realized_volatility']].plot(ax=axes[0])
# Fixed: the title previously said 'Top 25' although 50 rows are plotted.
axes[0].set_title('Top 50 squared percentage errors')
top[['spe']].plot(ax=axes[1])
axes[1].set_title('spe')
plt.show()
It definitely seems that the largest errors are coming from overpredicting the target, along with an especially small target. I think overprediction could be the only times when we have a large penalty from the metric. Lets look at the largest losses where we underpredict.
# Restrict to underpredictions and inspect the largest penalties among them.
df = train[train.pred < train.target]
df.sort_values('spe', ascending=False)[['pred', 'target', 'spe']].head(3)
The largest spe is less than .92. For reference, if we predicted 0 for all targets our average spe and overall metric would be 1. Given that the underpredicted errors gave spe penalties in the 5,6, and 7 range at the high end (highest about 200) with relatively small absolute error, and these underpredictions give a max penalty less than 1, I am certain that overpredicting especially small targets is the largest source of error.
Lets look at some of the book and trade data for the top losses. Maybe we can find some features that show that we are about to enter a low volatility period, even if the previous 10 minutes was high.
def add_percentile(df, col):
    """Add a `{col}_percentile` column ranking each row's value of `col`.

    Percentiles are integers in (0, 100]; ties share a rank (pandas'
    default average ranking, scaled by 100 and truncated to int).

    Parameters
    ----------
    df : pd.DataFrame
        Frame to annotate. Mutated in place for each column added.
    col : str or list/tuple of str
        Column name(s) to rank.

    Returns
    -------
    pd.DataFrame
        The same frame, for chaining.
    """
    # isinstance (not `type(col) == list`) also accepts list subclasses,
    # and tuples are now supported as a backward-compatible generalization.
    if isinstance(col, (list, tuple)):
        for c in col:
            df = add_percentile(df, c)
    else:
        df[col + '_percentile'] = (df[col].rank(pct=True) * 100).astype(int)
    return df
# Rank every row's pred/target/volatility as 0-100 percentiles (in place).
add_percentile(train, ['pred', 'target', 'log_return_realized_volatility'])
top_spe = train.sort_values('spe', ascending=False)
top_spe.head()
def plot_book(stock_id, time_id, train=train):
    """Plots book and trade data for single stock and time combination.
    This is to try and tease out indicators for especially large errors.

    Draws three panels (prices + WAP + trade price, per-level sizes with
    trade sizes as dots, total bid/ask sizes) and a title summarizing the
    row's error statistics.

    Note: `train=train` deliberately binds the module-level DataFrame at
    definition time; `load_bt` comes from `opt_utils` (star import above).
    """
    # Summary stats for this (stock, time) row, rounded for the title.
    mask = (train.stock_id == stock_id) & (train.time_id == time_id)
    target, pred, spe, realized_volatitlity, pred_percentile, target_percentile, \
    realized_volatitlity_percentile, size_sum, ratio \
    = train.loc[mask, ['target', 'pred', 'spe',
                       'log_return_realized_volatility',
                       'pred_percentile',
                       'target_percentile',
                       'log_return_realized_volatility_percentile',
                       'size_sum',
                       'ratio'
                       ]].values[0].round(5)
    # Raw merged book/trade data for this stock (all time_ids).
    bt = load_bt(DATA_RAW, stock_id, 'train')
    # Weighted average price from the level-1 quotes.
    bt['wap'] = (bt['bid_price1'] * bt['ask_size1'] + bt['ask_price1'] * bt['bid_size1']) /\
                (bt['bid_size1'] + bt['ask_size1'])
    bt['total_bid_size'] = bt['bid_size1'] + bt['bid_size2']
    bt['total_ask_size'] = bt['ask_size1'] + bt['ask_size2']
    # Restrict to the requested window, indexed by second for plotting.
    book = bt[bt.time_id == time_id].set_index('seconds_in_bucket')
    fig, axes = plt.subplots(3, 1, figsize=(15, 15))
    # Consistent colors: yellows = ask side, blues = bid side, red = trades.
    cmap = {'ask_price1': 'yellow',
            'ask_price2': 'khaki',
            'ask_size1': 'yellow',
            'ask_size2': 'khaki',
            'bid_price1': 'blue',
            'bid_price2': 'lightblue',
            'bid_size1': 'blue',
            'bid_size2': 'lightblue',
            'total_ask_size': 'yellow',
            'total_bid_size': 'blue',
            'wap': 'green',
            'size': 'red',
            'price': 'red',
            }
    book[['ask_price2', 'ask_price1', 'bid_price1', 'bid_price2', 'wap', 'price']].plot(ax=axes[0], color=cmap)
    book[['bid_size1', 'ask_size1', 'bid_size2','ask_size2']].plot(ax=axes[1], color=cmap)
    book[['total_bid_size', 'total_ask_size']].plot(ax=axes[2], color=cmap)
    # Trade sizes overlaid as dots on the size panel.
    book[['size']].plot(marker='o', ax=axes[1], color=cmap)
    axes[0].set_title('Price info')
    axes[1].set_title('Size info')
    axes[2].set_title('Total size info')
    axes[0].legend(loc='upper left')
    axes[1].legend(loc='upper left')
    axes[2].legend(loc='upper left')
    # Title summarizes the row's stats; `600 - len(book)` counts seconds
    # with no book update in the 10-minute (600 s) window.
    fig.suptitle(f'\
Stock: {stock_id}                    missing seconds: {600 - len(book)}\n\
Time_id: {time_id}\n\
spe: {spe} \n\
target: {target} \n\
pred: {pred} \n\
realized_volatitlity: {realized_volatitlity},\n\n\
error: {round(pred - target, 5)} \n\
target_percentile: {target_percentile} \n\
pred_percentile: {pred_percentile} \n\
realized_volatitlity_percentile: {realized_volatitlity_percentile}                    total sales: {size_sum}                ratio: {ratio}\n')
    plt.tight_layout()
    plt.show()
# NOTE(review): 'd' duplicates the 'raw_error' column added earlier.
train['d'] = train['pred'] - train['target']
# Inspect the books behind the 10 largest overpredictions.
for stock_id, time_id in train.sort_values('d', ascending=False)[['stock_id', 'time_id']].values[: 10]:
    plot_book(stock_id, time_id)
def mean_decay(x, decay=.9, step=-1, axis=0):
    """Exponentially-decayed weighted average of `x` along `axis`.

    With the default step=-1 the weights are reversed so the entries at
    the end of the axis carry the most weight; step=1 favors the start.
    """
    n = x.shape[axis]
    w = (decay ** np.arange(n)[::step]).astype(np.float32)
    return np.sum(w * x, axis=axis) / w.sum()
# For each of the five worst time_ids, plot the three worst rows' books.
for t in top_5_worst_times:
    tmp = train[train.time_id == t].sort_values('spe', ascending=False)
    for stock_id, time_id in tmp[['stock_id', 'time_id']].values[: 3]:
        plot_book(stock_id, time_id)
Lets look at the most penalized instances and see if we can see a pattern.
# Books for the 10 lowest realized-volatility rows.
low = train.sort_values('log_return_realized_volatility')
for stock_id, time_id in low[['stock_id', 'time_id']].values[: 10]:
    plot_book(stock_id, time_id)
They all have a very low ratio of sales to ask and bid size
# Worst errors excluding stock 31, then stock 31 on its own, then a
# mid-volatility control group (40th-60th percentile) for comparison.
for stock_id, time_id in top_spe[top_spe.stock_id != 31][['stock_id', 'time_id']].values[: 10]:
    plot_book(stock_id, time_id)
for stock_id, time_id in train[train.stock_id == 31][['stock_id', 'time_id']].values[: 10]:
    plot_book(stock_id, time_id)
comp = train[(train.log_return_realized_volatility_percentile < 60) & (train.log_return_realized_volatility_percentile > 40)]
for stock_id, time_id in comp[['stock_id', 'time_id']].values[:10]:
    plot_book(stock_id, time_id)
I can't see any obvious patterns in these plots that I could use to create different features than I already have.